Libraries

library(ggplot2)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## <U+221A> tibble  3.0.4     <U+221A> dplyr   1.0.2
## <U+221A> tidyr   1.1.2     <U+221A> stringr 1.4.0
## <U+221A> readr   1.4.0     <U+221A> forcats 0.5.0
## <U+221A> purrr   0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following object is masked from 'package:purrr':
## 
##     transpose
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(fastDummies)
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following objects are masked from 'package:data.table':
## 
##     dcast, melt
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(metan)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
## |========================================================|
## | Multi-Environment Trial Analysis (metan) v1.11.0       |
## | Author: Tiago Olivoto                                  |
## | Type 'citation('metan')' to know how to cite metan     |
## | Type 'vignette('metan_start')' for a short tutorial    |
## | Visit 'https://bit.ly/2TIq6JE' for a complete tutorial |
## |========================================================|
## 
## Attaching package: 'metan'
## The following object is masked from 'package:data.table':
## 
##     :=
## The following object is masked from 'package:forcats':
## 
##     as_factor
## The following object is masked from 'package:tidyr':
## 
##     replace_na
# Use library() rather than require(): library() errors immediately if
# caTools is missing, while require() only returns FALSE and would let
# the later sample.split() call fail with a confusing message.
library(caTools)
## Loading required package: caTools
library(rpart)
library(gensvm)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(outliers)

EDA

The aim of this study is to examine the relationships and distributions of variables using the Diamonds dataset available in R and finally develop a meaningful price estimation model. You can access and review the libraries used in this study from the Libraries section.

# Work on a copy of ggplot2's built-in diamonds dataset.
data <- ggplot2::diamonds

Exploring the Data

There are 53,940 rows and 10 columns in the diamonds dataset.

# Dataset dimensions: 53,940 rows by 10 columns.
dim(data)
## [1] 53940    10
# Peek at the first six observations.
head(data)
## # A tibble: 6 x 10
##   carat cut       color clarity depth table price     x     y     z
##   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23  Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43
## 2 0.21  Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
## 3 0.23  Good      E     VS1      56.9    65   327  4.05  4.07  2.31
## 4 0.290 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63
## 5 0.31  Good      J     SI2      63.3    58   335  4.34  4.35  2.75
## 6 0.24  Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48
# Column-wise structure: each variable's type and first few values.
data %>%glimpse()
## Rows: 53,940
## Columns: 10
## $ carat   <dbl> 0.23, 0.21, 0.23, 0.29, 0.31, 0.24, 0.24, 0.26, 0.22, 0.23,...
## $ cut     <ord> Ideal, Premium, Good, Premium, Good, Very Good, Very Good, ...
## $ color   <ord> E, E, E, I, J, J, I, H, E, H, J, J, F, J, E, E, I, J, J, J,...
## $ clarity <ord> SI2, SI1, VS1, VS2, SI2, VVS2, VVS1, SI1, VS2, VS1, SI1, VS...
## $ depth   <dbl> 61.5, 59.8, 56.9, 62.4, 63.3, 62.8, 62.3, 61.9, 65.1, 59.4,...
## $ table   <dbl> 55, 61, 65, 58, 58, 57, 57, 55, 61, 61, 55, 56, 61, 54, 62,...
## $ price   <int> 326, 326, 327, 334, 335, 336, 336, 337, 337, 338, 339, 340,...
## $ x       <dbl> 3.95, 3.89, 4.05, 4.20, 4.34, 3.94, 3.95, 4.07, 3.87, 4.00,...
## $ y       <dbl> 3.98, 3.84, 4.07, 4.23, 4.35, 3.96, 3.98, 4.11, 3.78, 4.05,...
## $ z       <dbl> 2.43, 2.31, 2.31, 2.63, 2.75, 2.48, 2.47, 2.53, 2.49, 2.39,...

We can see summary statistics for every variable, such as the median, mean and quartiles.

# Five-number summaries plus means for numeric columns, and level counts
# for the ordered factors (cut, color, clarity).
summary(data)
##      carat               cut        color        clarity          depth      
##  Min.   :0.2000   Fair     : 1610   D: 6775   SI1    :13065   Min.   :43.00  
##  1st Qu.:0.4000   Good     : 4906   E: 9797   VS2    :12258   1st Qu.:61.00  
##  Median :0.7000   Very Good:12082   F: 9542   SI2    : 9194   Median :61.80  
##  Mean   :0.7979   Premium  :13791   G:11292   VS1    : 8171   Mean   :61.75  
##  3rd Qu.:1.0400   Ideal    :21551   H: 8304   VVS2   : 5066   3rd Qu.:62.50  
##  Max.   :5.0100                     I: 5422   VVS1   : 3655   Max.   :79.00  
##                                     J: 2808   (Other): 2531                  
##      table           price             x                y         
##  Min.   :43.00   Min.   :  326   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.:56.00   1st Qu.:  950   1st Qu.: 4.710   1st Qu.: 4.720  
##  Median :57.00   Median : 2401   Median : 5.700   Median : 5.710  
##  Mean   :57.46   Mean   : 3933   Mean   : 5.731   Mean   : 5.735  
##  3rd Qu.:59.00   3rd Qu.: 5324   3rd Qu.: 6.540   3rd Qu.: 6.540  
##  Max.   :95.00   Max.   :18823   Max.   :10.740   Max.   :58.900  
##                                                                   
##        z         
##  Min.   : 0.000  
##  1st Qu.: 2.910  
##  Median : 3.530  
##  Mean   : 3.539  
##  3rd Qu.: 4.040  
##  Max.   :31.800  
## 

There are no missing values.

# Total count of NA values across the whole dataset (0 = no missing data).
sum(is.na(data))
## [1] 0

We can examine the distribution of the table variable; the values are concentrated mostly between 55 and 60.

# Histogram of the `table` variable (30 bins, minimal theme).
data %>%
  ggplot(aes(x = table)) +
  geom_histogram(bins = 30L, fill = "#0c4c8a") +
  theme_minimal()

We can examine the normal and logarithmic normal distribution of our target variable.

# Side-by-side normal Q-Q plots: price on the raw scale vs the log scale.
par(mfrow = c(1, 2))
qqnorm(diamonds$price, main = "Normal Q-Q Plot of Price")
qqline(diamonds$price)
qqnorm(log(diamonds$price), main = "Normal Q-Q Plot of log Price")
qqline(log(diamonds$price))

When we examine the mean of the x, y and z variables for each cut level, we see that diamonds with a Fair cut have the largest average dimensions.

# Mean price and mean physical dimensions (x, y, z) per cut category,
# ordered from cheapest to most expensive on average.
data %>%
  select(cut, price, x, y, z) %>%
  group_by(cut) %>%
  summarise(mean_price = mean(price),
            mean_x = mean(x),
            mean_y = mean(y),
            mean_z = mean(z)) %>%
  arrange(mean_price)  # was `arrange(mean_price, desc())`; desc() requires an
                       # argument, and the output below is sorted ascending
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 5 x 5
##   cut       mean_price mean_x mean_y mean_z
##   <ord>          <dbl>  <dbl>  <dbl>  <dbl>
## 1 Ideal          3458.   5.51   5.52   3.40
## 2 Good           3929.   5.84   5.85   3.64
## 3 Very Good      3982.   5.74   5.77   3.56
## 4 Fair           4359.   6.25   6.18   3.98
## 5 Premium        4584.   5.97   5.94   3.65

When we want to examine the variables with the help of boxplot, we observe that there are outlier values.

# Boxplots of x and y by cut: the long whisker tails reveal outliers.
ggplot(data, aes(x, color = cut)) +
  geom_boxplot()

ggplot(data, aes(y, color = cut)) +
  geom_boxplot()

# Price distribution within each cut category.
ggplot(data, aes(x = cut, y = price, color = cut)) +
  geom_boxplot()

When the relationship between carat and price is examined, we can say that the price also increases as carat increases.

# Random sample of 1,000 diamonds to keep the interactive plot light.
# NOTE(review): no set.seed() before sample(), so the plotted points
# differ between knits -- confirm whether reproducibility matters here.
d <- diamonds[sample(nrow(diamonds), 1000), ]

# Interactive carat-vs-price scatter; marker colour and size encode carat.
fig <- plot_ly(
  d, x = ~carat, y = ~price,
  # Hover text:
  text = ~paste("Price: ", price, '$<br>Cut:', cut),
  color = ~carat, size = ~carat
)

fig
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#scatter
## No scatter mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## Warning: `line.width` does not currently support multiple values.
# Per-cut price summary: observation count, mean, median and the
# interquartile bounds (Q1, Q3).
data %>%
  group_by(cut) %>%
  summarise(n = n(),
            mean = mean(price),
            median = median(price),
            Q1 = quantile(price, 0.25),
            Q3 = quantile(price, 0.75))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 5 x 6
##   cut           n  mean median    Q1    Q3
##   <ord>     <int> <dbl>  <dbl> <dbl> <dbl>
## 1 Fair       1610 4359.  3282  2050. 5206.
## 2 Good       4906 3929.  3050. 1145  5028 
## 3 Very Good 12082 3982.  2648   912  5373.
## 4 Premium   13791 4584.  3185  1046  6296 
## 5 Ideal     21551 3458.  1810   878  4678.
# Drop the categorical columns (2:4 = cut, color, clarity) so that
# rm.outlier() only operates on numeric data.
df <- data[,-c(2,3,4)]
# outliers::rm.outlier with fill = TRUE replaces the most extreme value
# in each column with that column's median (median = TRUE).
# NOTE(review): this runs a single pass, so at most one value per column
# is replaced -- confirm that a one-shot treatment is intended.
df <- rm.outlier(df, fill = TRUE, median = TRUE, opposite = FALSE)
# Write the treated numeric columns back into the full dataset.
data[,colnames(df)] <- df
head(data)
## # A tibble: 6 x 10
##   carat cut       color clarity depth table price     x     y     z
##   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.23  Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43
## 2 0.21  Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
## 3 0.23  Good      E     VS1      56.9    65   327  4.05  4.07  2.31
## 4 0.290 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63
## 5 0.31  Good      J     SI2      63.3    58   335  4.34  4.35  2.75
## 6 0.24  Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48

We assign dummies to categorical variables in order to include categorical variables in the model.

# One-hot encode the categorical columns, split 80/20 into train/test
# (stratified on price by sample.split), and draw a first correlation
# heatmap of the training set.
data <- dummy_cols(data,
                   select_columns = c("cut", "color", "clarity"),
                   remove_selected_columns = TRUE)
set.seed(123)
split <- sample.split(data$price, SplitRatio = 0.8)
training_set <- subset(data, split == TRUE)
test_set <- subset(data, split == FALSE)
cormat <- round(cor(training_set), 2)

melted_cormat <- melt(cormat)

ggplot(melted_cormat, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile()

# Get lower triangle of the correlation matrix (upper entries set to NA)
get_lower_tri <- function(cormat) {
  cormat[upper.tri(cormat)] <- NA
  return(cormat)
}
# Get upper triangle of the correlation matrix (lower entries set to NA)
get_upper_tri <- function(cormat) {
  cormat[lower.tri(cormat)] <- NA
  return(cormat)
}
# Reorder the correlation matrix via hierarchical clustering so that
# strongly correlated variables sit next to each other in the heatmap.
# This helper is called below but was never defined in the original
# document; the definition follows the STHDA tutorial cited in References.
reorder_cormat <- function(cormat) {
  dd <- as.dist((1 - cormat) / 2)  # map correlation to a distance in [0, 1]
  hc <- hclust(dd)
  return(cormat[hc$order, hc$order])
}

# Reorder by clustering first, then keep only the upper triangle.
# (The original also computed upper_tri once *before* reordering; that
# result was immediately overwritten and never used, so the dead store
# has been removed.)
cormat <- reorder_cormat(cormat)
upper_tri <- get_upper_tri(cormat)
# Melt the correlation matrix, dropping the NA lower triangle
melted_cormat <- melt(upper_tri, na.rm = TRUE)
# Create a ggheatmap
ggheatmap <- ggplot(melted_cormat, aes(Var2, Var1, fill = value)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white",
                       midpoint = 0, limit = c(-1, 1), space = "Lab",
                       name = "Pearson\nCorrelation") +
  theme_minimal() + # minimal theme
  theme(axis.text.x = element_text(angle = 45, vjust = 1,
                                   size = 12, hjust = 1)) +
  coord_fixed()

print(ggheatmap)

# Overlay the numeric correlation values and tidy the legend placement.
ggheatmap +
  geom_text(aes(Var2, Var1, label = value), color = "black", size = 2) +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    panel.grid.major = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    axis.ticks = element_blank(),
    legend.justification = c(1, 0),
    legend.position = c(0.6, 0.7),
    legend.direction = "horizontal") +
  guides(fill = guide_colorbar(barwidth = 7, barheight = 1,
                               title.position = "top", title.hjust = 0.5))

We scale the train and test data obtained above. After running a regression on the scaled data, we keep only the significant variables and fit the model again.

# Standardise every column (the target `price` included, so coefficients
# below are on the standardised scale).
# NOTE(review): test_set is scaled with its own means/SDs rather than the
# training set's centre/scale attributes -- this leaks test-set statistics
# into evaluation; consider reusing attr(training_set, "scaled:center"/
# "scaled:scale") on the test data.
training_set = scale(training_set)
test_set = scale(test_set)

d1 <- data.frame(training_set)
d2 <- data.frame(test_set)

# Full linear model: scaled price regressed on all remaining predictors.
# The NA coefficients below (cut_Ideal, color_J, clarity_IF) are the
# redundant baseline dummy levels -- perfectly collinear with the rest.
regressor = lm(formula = price ~ .,
               data = d1)


summary(regressor)
## 
## Call:
## lm(formula = price ~ ., data = d1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.8431 -0.1494 -0.0461  0.0963  6.6026 
## 
## Coefficients: (3 not defined because of singularities)
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -3.172e-14  1.341e-03   0.000 1.000000    
## carat          1.336e+00  6.724e-03 198.770  < 2e-16 ***
## depth         -2.038e-02  2.108e-03  -9.670  < 2e-16 ***
## table         -1.472e-02  1.795e-03  -8.199 2.48e-16 ***
## x             -2.954e-01  1.434e-02 -20.607  < 2e-16 ***
## y              3.647e-02  1.074e-02   3.396 0.000685 ***
## z             -2.899e-02  1.111e-02  -2.610 0.009054 ** 
## cut_Fair      -3.245e-02  1.564e-03 -20.756  < 2e-16 ***
## cut_Good      -1.835e-02  1.608e-03 -11.412  < 2e-16 ***
## cut_Very.Good -1.129e-02  1.637e-03  -6.897 5.38e-12 ***
## cut_Premium   -8.152e-03  1.762e-03  -4.627 3.72e-06 ***
## cut_Ideal             NA         NA      NA       NA    
## color_D        1.957e-01  2.382e-03  82.148  < 2e-16 ***
## color_E        2.073e-01  2.640e-03  78.529  < 2e-16 ***
## color_F        1.998e-01  2.608e-03  76.615  < 2e-16 ***
## color_G        1.917e-01  2.725e-03  70.366  < 2e-16 ***
## color_H        1.249e-01  2.480e-03  50.355  < 2e-16 ***
## color_I        6.618e-02  2.181e-03  30.336  < 2e-16 ***
## color_J               NA         NA      NA       NA    
## clarity_I1    -1.516e-01  1.627e-03 -93.145  < 2e-16 ***
## clarity_SI2   -2.503e-01  3.180e-03 -78.730  < 2e-16 ***
## clarity_SI1   -1.821e-01  3.477e-03 -52.367  < 2e-16 ***
## clarity_VS2   -1.155e-01  3.381e-03 -34.169  < 2e-16 ***
## clarity_VS1   -7.057e-02  2.960e-03 -23.841  < 2e-16 ***
## clarity_VVS2  -2.960e-02  2.524e-03 -11.728  < 2e-16 ***
## clarity_VVS1  -2.158e-02  2.266e-03  -9.521  < 2e-16 ***
## clarity_IF            NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2821 on 44210 degrees of freedom
## Multiple R-squared:  0.9205, Adjusted R-squared:  0.9204 
## F-statistic: 2.224e+04 on 23 and 44210 DF,  p-value: < 2.2e-16
# Drop z plus the redundant baseline dummies whose coefficients were NA
# in the first fit. Selecting by name replaces the original magic column
# indices c(7, 12, 19, 27), which silently break if column order changes.
drop_cols <- c("z", "cut_Ideal", "color_J", "clarity_IF")
d1 <- d1[, !(names(d1) %in% drop_cols)]
d2 <- d2[, !(names(d2) %in% drop_cols)]

# Refit the linear model on the reduced predictor set.
regressor <- lm(formula = price ~ .,
                data = d1)

summary(regressor)
## 
## Call:
## lm(formula = price ~ ., data = d1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.8513 -0.1494 -0.0463  0.0962  6.5954 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -3.167e-14  1.341e-03   0.000  1.00000    
## carat          1.337e+00  6.724e-03 198.766  < 2e-16 ***
## depth         -2.390e-02  1.622e-03 -14.736  < 2e-16 ***
## table         -1.474e-02  1.795e-03  -8.213  < 2e-16 ***
## x             -3.154e-01  1.214e-02 -25.982  < 2e-16 ***
## y              2.759e-02  1.019e-02   2.708  0.00677 ** 
## cut_Fair      -3.246e-02  1.564e-03 -20.758  < 2e-16 ***
## cut_Good      -1.828e-02  1.607e-03 -11.370  < 2e-16 ***
## cut_Very.Good -1.134e-02  1.637e-03  -6.926 4.39e-12 ***
## cut_Premium   -8.039e-03  1.761e-03  -4.564 5.03e-06 ***
## color_D        1.957e-01  2.382e-03  82.160  < 2e-16 ***
## color_E        2.074e-01  2.640e-03  78.543  < 2e-16 ***
## color_F        1.998e-01  2.608e-03  76.634  < 2e-16 ***
## color_G        1.918e-01  2.725e-03  70.392  < 2e-16 ***
## color_H        1.250e-01  2.480e-03  50.389  < 2e-16 ***
## color_I        6.622e-02  2.182e-03  30.355  < 2e-16 ***
## clarity_I1    -1.515e-01  1.627e-03 -93.106  < 2e-16 ***
## clarity_SI2   -2.503e-01  3.180e-03 -78.706  < 2e-16 ***
## clarity_SI1   -1.820e-01  3.477e-03 -52.344  < 2e-16 ***
## clarity_VS2   -1.155e-01  3.381e-03 -34.149  < 2e-16 ***
## clarity_VS1   -7.053e-02  2.960e-03 -23.825  < 2e-16 ***
## clarity_VVS2  -2.957e-02  2.524e-03 -11.716  < 2e-16 ***
## clarity_VVS1  -2.155e-02  2.266e-03  -9.509  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2821 on 44211 degrees of freedom
## Multiple R-squared:  0.9204, Adjusted R-squared:  0.9204 
## F-statistic: 2.325e+04 on 22 and 44211 DF,  p-value: < 2.2e-16
# Evaluate on the held-out test set.
pred <- predict(regressor, newdata = d2)

# Root-mean-squared error on the (standardised) test target.
RMSE(pred = pred, obs = d2$price)
## [1] 0.2932916

References

http://www.sthda.com/english/wiki/ggplot2-quick-correlation-matrix-heatmap-r-software-and-data-visualization